# SETWD ===========================
rm(list=ls())


# LIBRARIES ===========================

library(tidyverse)
library(foreign)
library(stargazer)
library(lmtest)
library(multiwayvcov)

# IMPORT DATA ===========================

dfgi <- read.csv("data/sve-08_04_24.csv",stringsAsFactors = F)

# PREDICTIVENESS ===========================

## Figure A9: ROC Curve ----

### Gini + GDP ----
temp <- dfgi%>%
  filter(!is.na(gini_disp)&!is.na(gdppc))

m1 <- glm(erosion.strict ~ gini_disp + log(gdppc), 
          data=temp, family="binomial")
temp$pred1 <- predict(m1, temp, type="response")
pos.scores <- temp%>%
  filter(!is.na(pred1))%>%
  filter(erosion.strict==1)%>%
  pull(pred1)
neg.scores <- temp%>%
  filter(!is.na(pred1))%>%
  filter(erosion.strict==0)%>%
  pull(pred1)
thresholds <- seq(0,1,0.001)
df.roc <- data.frame(thresholds)%>%
  mutate(fpr=NA,tpr=NA,model="econ")
for(i in 1:length(thresholds)){
  df.roc$fpr[i] <- mean(neg.scores>thresholds[i])
  df.roc$tpr[i] <- mean(pos.scores>thresholds[i])
}

### Year ----
m1 <- glm(erosion.strict ~ year, 
          data=temp, family="binomial")
temp$pred1 <- predict(m1, temp, type="response")
pos.scores <- temp%>%
  filter(!is.na(pred1))%>%
  filter(erosion.strict==1)%>%
  pull(pred1)
neg.scores <- temp%>%
  filter(!is.na(pred1))%>%
  filter(erosion.strict==0)%>%
  pull(pred1)
df.roc2 <- data.frame(thresholds)%>%
  mutate(fpr=NA,tpr=NA,model="year")
for(i in 1:length(thresholds)){
  df.roc2$fpr[i] <- mean(neg.scores>thresholds[i])
  df.roc2$tpr[i] <- mean(pos.scores>thresholds[i])
}

### Gini + GDP + Year ----
m1 <- glm(erosion.strict ~ gini_disp + log(gdppc) + year, 
          data=temp, family="binomial")
temp$pred1 <- predict(m1, temp, type="response")
pos.scores <- temp%>%
  filter(!is.na(pred1))%>%
  filter(erosion.strict==1)%>%
  pull(pred1)
neg.scores <- temp%>%
  filter(!is.na(pred1))%>%
  filter(erosion.strict==0)%>%
  pull(pred1)
df.roc3 <- data.frame(thresholds)%>%
  mutate(fpr=NA,tpr=NA,model="year.econ")


### Merge and plot ----
for(i in 1:length(thresholds)){
  df.roc3$fpr[i] <- mean(neg.scores>thresholds[i])
  df.roc3$tpr[i] <- mean(pos.scores>thresholds[i])
}

df.roc <- rbind(df.roc,df.roc2,df.roc3)


ggplot(df.roc,aes(x=fpr,y=tpr,color=model))+
  geom_abline(slope=1,intercept=0,color="gray80",linetype=2)+
  geom_line(linewidth=0.6)+
  xlab("False Positive Rate")+
  ylab("True Positive Rate")+
  scale_color_manual(name="Model",
                     labels=c("Gini + GDP",
                              "Year",
                              "Gini + GDP + Year"),
                     values=c("#5ab4bd","#b5d63d","#1c445e"))+
  scale_x_continuous(expand = c(0.01,0))+
  scale_y_continuous(expand = c(0,0.01))+
  theme_classic()

ggsave("figures/roc-comb2.png",width=4.5,height=3)



## AUC estimates ----

auc <- function(temp){
  pos.scores <- temp%>%
    filter(!is.na(pred1))%>%
    filter(erosion.strict==1)%>%
    pull(pred1)
  
  neg.scores <- temp%>%
    filter(!is.na(pred1))%>%
    filter(erosion.strict==0)%>%
    pull(pred1)
  
  # estimate AUC
  set.seed(135102)
  reps <- 1000000
  mean(sample(pos.scores,reps,replace=T) > sample(neg.scores,reps,replace=T))
}

### Base models ----
m1 <- glm(erosion.strict ~ gini_disp, 
          data=temp, family="binomial")
temp$pred1 <- predict(m1, temp, type="response")
auc(temp)

m1 <- glm(erosion.strict ~ gini_disp + log(gdppc), 
          data=temp, family="binomial")
temp$pred1 <- predict(m1, temp, type="response")
auc(temp)

m1 <- glm(erosion.strict ~ log(gdppc), 
          data=temp, family="binomial")
temp$pred1 <- predict(m1, temp, type="response")
auc(temp)

m1 <- glm(erosion.strict ~ gini_disp + log(gdppc) + year, 
          data=temp, family="binomial")
temp$pred1 <- predict(m1, temp, type="response")
auc(temp)

m1 <- glm(erosion.strict ~ year, 
          data=temp, family="binomial")
temp$pred1 <- predict(m1, temp, type="response")
auc(temp)
